{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MAXP 2021初赛数据探索和处理-1\n",
    "\n",
    "由于节点的Feature维度比较高，所以先处理节点的ID，并保存。Feature的处理放到第2部分。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path\n",
    "base_path = '/Users/jamezhan/PycharmProjects/MAXP/final_dataset'\n",
    "publish_path = 'publish'\n",
    "\n",
    "link_p1_path = os.path.join(base_path, publish_path, 'link_phase1.csv')\n",
    "train_nodes_path = os.path.join(base_path, publish_path, 'train_nodes.csv')\n",
    "val_nodes_path = os.path.join(base_path, publish_path, 'validation_nodes.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取边列表并统计节点数量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(29168650, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>paper_id</th>\n",
       "      <th>reference_paper_id</th>\n",
       "      <th>phase</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>f10da75ad1eaf16eb2ffe0d85b76b332</td>\n",
       "      <td>711ef25bdb2c2421c0131af77b3ede1d</td>\n",
       "      <td>phase1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9ac5a4327bd4f3dcb424c93ca9b84087</td>\n",
       "      <td>2d91c73304c5e8a94a0e5b4956093f71</td>\n",
       "      <td>phase1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9d91bfd4703e55dd814dfffb3d63fc33</td>\n",
       "      <td>33d4fdfe3967a1ffde9311bfe6827ef9</td>\n",
       "      <td>phase1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>e1bdbce05528952ed6579795373782d4</td>\n",
       "      <td>4bda690abec912b3b7b228b01fb6819a</td>\n",
       "      <td>phase1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>eb623ac4b10df96835921edabbde2951</td>\n",
       "      <td>c1a05bdfc88a73bf2830e705b2f39dbb</td>\n",
       "      <td>phase1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           paper_id                reference_paper_id   phase\n",
       "0  f10da75ad1eaf16eb2ffe0d85b76b332  711ef25bdb2c2421c0131af77b3ede1d  phase1\n",
       "1  9ac5a4327bd4f3dcb424c93ca9b84087  2d91c73304c5e8a94a0e5b4956093f71  phase1\n",
       "2  9d91bfd4703e55dd814dfffb3d63fc33  33d4fdfe3967a1ffde9311bfe6827ef9  phase1\n",
       "3  e1bdbce05528952ed6579795373782d4  4bda690abec912b3b7b228b01fb6819a  phase1\n",
       "4  eb623ac4b10df96835921edabbde2951  c1a05bdfc88a73bf2830e705b2f39dbb  phase1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "edge_df = pd.read_csv(link_p1_path)\n",
    "print(edge_df.shape)\n",
    "edge_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count     29168650\n",
       "unique           1\n",
       "top         phase1\n",
       "freq      29168650\n",
       "Name: phase, dtype: object"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "edge_df.phase.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3031367, 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>paper_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>f10da75ad1eaf16eb2ffe0d85b76b332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9ac5a4327bd4f3dcb424c93ca9b84087</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9d91bfd4703e55dd814dfffb3d63fc33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>e1bdbce05528952ed6579795373782d4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           paper_id\n",
       "0  f10da75ad1eaf16eb2ffe0d85b76b332\n",
       "1  9ac5a4327bd4f3dcb424c93ca9b84087\n",
       "2  9d91bfd4703e55dd814dfffb3d63fc33\n",
       "3  e1bdbce05528952ed6579795373782d4"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nodes = pd.concat([edge_df['paper_id'], edge_df['reference_paper_id']])\n",
    "nodes = pd.DataFrame(nodes.drop_duplicates())\n",
    "nodes.rename(columns={0:'paper_id'}, inplace=True)\n",
    "\n",
    "print(nodes.shape)\n",
    "nodes.head(4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 在边列表，一共出现了3,031,367个节点(paper_id)\n",
    "\n",
    "## 读取并查看train_nodes和validation_nodes里面的节点"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_node(line):\n",
    "    nid, feat_json, label = line.strip().split('\\\"')\n",
    "    \n",
    "    feat_list = [float(feat[1:-1]) for feat in feat_json[1:-1].split(', ')]\n",
    "    \n",
    "    if len(feat_list) != 300:\n",
    "        print('此行数据有问题 {}'.format(line))\n",
    "    \n",
    "    return nid[:-1], feat_list, label[1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 100000 train rows\n",
      "Processed 200000 train rows\n",
      "Processed 300000 train rows\n",
      "Processed 400000 train rows\n",
      "Processed 500000 train rows\n",
      "Processed 600000 train rows\n",
      "Processed 700000 train rows\n",
      "Processed 800000 train rows\n",
      "Processed 900000 train rows\n",
      "Processed 1000000 train rows\n",
      "Processed 1100000 train rows\n",
      "Processed 1200000 train rows\n",
      "Processed 1300000 train rows\n",
      "Processed 1400000 train rows\n",
      "Processed 1500000 train rows\n",
      "Processed 1600000 train rows\n",
      "Processed 1700000 train rows\n",
      "Processed 1800000 train rows\n",
      "Processed 1900000 train rows\n",
      "Processed 2000000 train rows\n",
      "Processed 2100000 train rows\n",
      "Processed 2200000 train rows\n",
      "Processed 2300000 train rows\n",
      "Processed 2400000 train rows\n",
      "Processed 2500000 train rows\n",
      "Processed 2600000 train rows\n",
      "Processed 2700000 train rows\n",
      "Processed 2800000 train rows\n",
      "Processed 2900000 train rows\n",
      "Processed 3000000 train rows\n",
      "Processed 100000 validation rows\n",
      "Processed 200000 validation rows\n",
      "Processed 300000 validation rows\n",
      "Processed 400000 validation rows\n",
      "Processed 500000 validation rows\n"
     ]
    }
   ],
   "source": [
    "# 先构建ID和Label的关系，保证ID的顺序和Feature的顺序一致即可\n",
    "nid_list = []\n",
    "label_list = []\n",
    "tr_val_list = []\n",
    "\n",
    "with open(train_nodes_path, 'r') as f:\n",
    "    i = 0\n",
    "    \n",
    "    for line in f:\n",
    "        if i > 0:\n",
    "            nid, _, label = process_node(line)\n",
    "            nid_list.append(nid)\n",
    "            label_list.append(label)\n",
    "            tr_val_list.append(0)             # 0表示train的点\n",
    "        i += 1\n",
    "        if i % 100000 == 0:\n",
    "            print('Processed {} train rows'.format(i))\n",
    "\n",
    "with open(val_nodes_path, 'r') as f:\n",
    "    i = 0\n",
    "    \n",
    "    for line in f:\n",
    "        if i > 0:\n",
    "            nid, _, label = process_node(line)\n",
    "            nid_list.append(nid)\n",
    "            label_list.append(label)\n",
    "            tr_val_list.append(1)             # 1表示validation的点\n",
    "        i += 1\n",
    "        if i % 100000 == 0:\n",
    "            print('Processed {} validation rows'.format(i))\n",
    "            \n",
    "nid_arr = np.array(nid_list)\n",
    "label_arr = np.array(label_list)\n",
    "tr_val_arr = np.array(tr_val_list)\n",
    "    \n",
    "nid_label_df = pd.DataFrame({'paper_id':nid_arr, 'Label': label_arr, 'Split_ID':tr_val_arr})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3655033, 4)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>node_idx</th>\n",
       "      <th>paper_id</th>\n",
       "      <th>Label</th>\n",
       "      <th>Split_ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>bfdee5ab86ef5e68da974d48a138c28e</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>78f43b8b62f040347fec0be44e5f08bd</td>\n",
       "      <td></td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>a971601a0286d2701aa5cde46e63a9fd</td>\n",
       "      <td>G</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>ac4b88a72146bae66cedfd1c13e1552d</td>\n",
       "      <td></td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   node_idx                          paper_id Label  Split_ID\n",
       "0         0  bfdee5ab86ef5e68da974d48a138c28e     S         0\n",
       "1         1  78f43b8b62f040347fec0be44e5f08bd               0\n",
       "2         2  a971601a0286d2701aa5cde46e63a9fd     G         0\n",
       "3         3  ac4b88a72146bae66cedfd1c13e1552d               0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nid_label_df.reset_index(inplace=True)\n",
    "nid_label_df.rename(columns={'index':'node_idx'}, inplace=True)\n",
    "print(nid_label_df.shape)\n",
    "nid_label_df.head(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3655033,)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 检查ID在Train和Validation是否有重复\n",
    "ids = nid_label_df.paper_id.drop_duplicates()\n",
    "ids.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### train和validation一共有3,655,033个节点"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 下面交叉比对边列表里的paper id和节点列表里的ID，检查是否有匹配不上的节点"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3030948, 4)\n"
     ]
    }
   ],
   "source": [
    "inboth = nid_label_df.merge(nodes, on='paper_id', how='inner')\n",
    "print(inboth.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3031367, 4)\n",
      "共有419边列表的节点在给出的节点列表里没有对应，缺乏特征\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>paper_id</th>\n",
       "      <th>node_idx</th>\n",
       "      <th>Label</th>\n",
       "      <th>Split_ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1124</th>\n",
       "      <td>cc388eaec8838ce383d8a8792014fedb</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1184</th>\n",
       "      <td>5d899f41e52f751fef843cf7b1d05b4a</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14342</th>\n",
       "      <td>2b2004ec3c99a44b5cb6045ca547453e</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15803</th>\n",
       "      <td>d657c4451a9617f4eec96d3b2e6092c7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               paper_id  node_idx Label  Split_ID\n",
       "1124   cc388eaec8838ce383d8a8792014fedb       NaN   NaN       NaN\n",
       "1184   5d899f41e52f751fef843cf7b1d05b4a       NaN   NaN       NaN\n",
       "14342  2b2004ec3c99a44b5cb6045ca547453e       NaN   NaN       NaN\n",
       "15803  d657c4451a9617f4eec96d3b2e6092c7       NaN   NaN       NaN"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "edge_node = nodes.merge(nid_label_df, on='paper_id', how='left')\n",
    "print(edge_node.shape)\n",
    "print('共有{}边列表的节点在给出的节点列表里没有对应，缺乏特征'.format(edge_node[edge_node.node_idx.isna()].shape[0]))\n",
    "edge_node[edge_node.node_idx.isna()].head(4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 合并边列表里独特的节点和train和validation的节点到一起，构成全部节点列表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/jamezhan/anaconda3/envs/dgl/lib/python3.6/site-packages/ipykernel_launcher.py:3: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n",
      "/Users/jamezhan/anaconda3/envs/dgl/lib/python3.6/site-packages/pandas/core/generic.py:5170: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self[name] = value\n",
      "/Users/jamezhan/anaconda3/envs/dgl/lib/python3.6/site-packages/pandas/core/frame.py:4174: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  errors=errors,\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>node_idx</th>\n",
       "      <th>paper_id</th>\n",
       "      <th>Label</th>\n",
       "      <th>Split_ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3655033</td>\n",
       "      <td>cc388eaec8838ce383d8a8792014fedb</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3655034</td>\n",
       "      <td>5d899f41e52f751fef843cf7b1d05b4a</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3655035</td>\n",
       "      <td>2b2004ec3c99a44b5cb6045ca547453e</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3655036</td>\n",
       "      <td>d657c4451a9617f4eec96d3b2e6092c7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   node_idx                          paper_id Label  Split_ID\n",
       "0   3655033  cc388eaec8838ce383d8a8792014fedb   NaN         1\n",
       "1   3655034  5d899f41e52f751fef843cf7b1d05b4a   NaN         1\n",
       "2   3655035  2b2004ec3c99a44b5cb6045ca547453e   NaN         1\n",
       "3   3655036  d657c4451a9617f4eec96d3b2e6092c7   NaN         1"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 获取未能匹配上的节点，并构建新的节点DataFrame，然后和原有的Train/Validation节点Concat起来\n",
    "diff_nodes = edge_node[edge_node.node_idx.isna()]\n",
    "diff_nodes.ID = diff_nodes.paper_id\n",
    "diff_nodes.Split_ID = 1\n",
    "diff_nodes.node_idx = 0\n",
    "diff_nodes.reset_index(inplace=True)\n",
    "diff_nodes.drop(['index'], axis=1, inplace=True)\n",
    "diff_nodes.node_idx = diff_nodes.node_idx + diff_nodes.index + 3655033\n",
    "diff_nodes = diff_nodes[['node_idx', 'paper_id', 'Label', 'Split_ID']]\n",
    "diff_nodes.head(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>node_idx</th>\n",
       "      <th>paper_id</th>\n",
       "      <th>Label</th>\n",
       "      <th>Split_ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>415</th>\n",
       "      <td>3655448</td>\n",
       "      <td>caed47d55d1e193ecb1fa97a415c13dd</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>416</th>\n",
       "      <td>3655449</td>\n",
       "      <td>c82eb6be79a245392fb626b9a7e1f246</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>417</th>\n",
       "      <td>3655450</td>\n",
       "      <td>926a31f6b378575204aae30b5dfa6dd3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>418</th>\n",
       "      <td>3655451</td>\n",
       "      <td>bbace2419c3f827158ea4602f3eb35fa</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     node_idx                          paper_id Label  Split_ID\n",
       "415   3655448  caed47d55d1e193ecb1fa97a415c13dd   NaN         1\n",
       "416   3655449  c82eb6be79a245392fb626b9a7e1f246   NaN         1\n",
       "417   3655450  926a31f6b378575204aae30b5dfa6dd3   NaN         1\n",
       "418   3655451  bbace2419c3f827158ea4602f3eb35fa   NaN         1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Concatenate这419个未匹配到的节点到总的node的最后，从而让nid能接上\n",
    "nid_label_df = pd.concat([nid_label_df, diff_nodes])\n",
    "nid_label_df.tail(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存ID和Label到本地文件\n",
    "nid_label_df.to_csv(os.path.join(base_path, publish_path, './IDandLabels.csv'), index=False)\n",
    "# 保存未匹配上的节点用于feature的处理\n",
    "diff_nodes.to_csv(os.path.join(base_path, publish_path, './diff_nodes.csv'), index=False)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:dgl]",
   "language": "python",
   "name": "conda-env-dgl-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
